library(downloader)
# Fetch the raw Gapminder table and store it next to the script. The file name
# is kept in one variable so the download target and the file that is read back
# cannot drift apart (previously the script downloaded "gapminder.tsv" but then
# tried to read "gapminder2.tsv", which is never created here).
gapminder_file <- "gapminder.tsv"
download(
  url = "https://raw.githubusercontent.com/jennybc/gapminder/master/inst/gapminder.tsv",
  destfile = gapminder_file
)
# Bring in the data as a data frame and print its structure.
library(readr)
gapminder <- read.delim(gapminder_file)
str(gapminder)
# Descriptive plots ----
# NOTE(review): an earlier comment here read "remove the oc" (presumably
# Oceania); no such filtering is performed below -- confirm whether it is needed.
library(ggplot2)
library(dplyr)

# Number of observations per continent, as a markdown table and a base bar plot.
freq <- as.data.frame(table(gapminder$continent))
names(freq) <- c("continent", "obs")
# kable() comes from knitr, which this script never attaches, so call it through
# its namespace instead of relying on it being on the search path.
knitr::kable(freq, format = "markdown")
barplot(table(gapminder$continent))

# Histograms of life expectancy and GDP per capita over all rows.
p1 <- ggplot(gapminder, aes(x = lifeExp)) +
  geom_histogram() +
  ggtitle("lifeExp distribution")
p2 <- ggplot(gapminder, aes(x = gdpPercap)) +
  geom_histogram() +
  ggtitle("gdpPercap distribution")

# GDP per capita by continent, drawn on a log10 y axis.
p3 <- ggplot(gapminder, aes(x = continent, y = gdpPercap)) +
  geom_boxplot(aes(fill = continent)) +
  scale_y_log10() +
  ggtitle("log(gdpPercap) distribution in different continents")

# GDP per capita against year, one panel per continent, with a linear trend.
p4 <- ggplot(gapminder, aes(x = year, y = gdpPercap)) +
  geom_point(aes(color = country)) +
  geom_smooth(lwd = 2, se = FALSE, method = "lm") +
  scale_y_log10() +
  facet_grid(~continent) +
  ggtitle("log(gdpPercap) growth in different continent")

# Write each plot to a PNG; the argument is spelled out as `filename` rather
# than relying on partial matching of `file`.
ggsave(filename = "lifeExp_dist.png", plot = p1)
ggsave(filename = "gdpPercap_dist.png", plot = p2)
ggsave(filename = "gdpPercap_dist_continent.png", plot = p3)
ggsave(filename = "gdpPercap_growth_continent.png", plot = p4)
# Reorder the continents by the cross-country spread of life expectancy
# (max - min of lifeExp) within each continent in 2007.
gap <- gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarise(diff = max(lifeExp) - min(lifeExp))

# Attach the per-continent spread to every row. The join key is named
# explicitly so the join cannot silently pick up other shared columns.
gap <- full_join(gapminder, gap, by = "continent")

# reorder() turns continent into a factor whose levels are ordered by diff.
gapminder <- gap %>%
  mutate(continent = reorder(continent, diff))
levels(gapminder$continent)

# Sort the data by continent, country and year, then write it out.
# NOTE(review): the name `sort` shadows base::sort(); it is kept unchanged in
# case other code refers to it.
sort <- gapminder[order(gapminder$continent, gapminder$country, gapminder$year), ]
write_tsv(sort, "gap_sort.tsv")
# Merge diff into gapminder data in order to quickly reorder continent when
# reloading the file.
# Descriptive plots:
# Reload the sorted data from gap_sort.tsv (the original note attributes this
# file to download.R).
library(readr)
library(dplyr) # attached here because %>% is used below, before the later library() calls
# Continents should be treated as factors. Since R 4.0.0 read.delim() no longer
# converts strings to factors by default, so the conversion is requested
# explicitly to get the same result on every R version.
gapminder <- read.delim("gap_sort.tsv", stringsAsFactors = TRUE)
str(gapminder)
# Make sure the continent order (by diff) is still in force after the reload.
gapminder <- gapminder %>%
  dplyr::mutate(continent = reorder(continent, diff))
levels(gapminder$continent)
library(plyr)
library(dplyr)

# Fit a linear regression of life expectancy on year for one group of rows.
#
# dat:    data frame with `lifeExp` and `year` columns.
# offset: year subtracted from `year` before fitting, so the intercept is the
#         fitted life expectancy in that year (default 1952).
# Returns a named numeric vector: intercept, slope, and sd (the standard
# deviation of the residuals).
fit_life_exp_on_year <- function(dat, offset = 1952) {
  the_fit <- lm(lifeExp ~ I(year - offset), data = dat)
  # The value is labelled "sd", so compute a standard deviation; the previous
  # code stored var(residuals) under that name.
  resid_sd <- sd(residuals(the_fit), na.rm = TRUE)
  setNames(c(coef(the_fit), resid_sd), c("intercept", "slope", "sd"))
}

# One regression per country (continent is carried along as a grouping column).
reg <- ddply(gapminder, ~ country + continent, fit_life_exp_on_year)
reg <- reg[order(reg$continent, reg$country), ]

# Write results to file.
write_tsv(reg, "reg_life_year.tsv")
# Find the "worst" and "best" countries for each continent: within every
# continent keep the rows whose intercept ranks among the 4 lowest or the 4
# highest (ties share the minimum rank, so a tie can admit extra rows).
rank_int <- reg %>%
  group_by(continent) %>%
  # dplyr::desc is named explicitly because plyr, attached after dplyr, also
  # exports a desc().
  filter(
    min_rank(intercept) < 5 |
      min_rank(dplyr::desc(intercept)) < 5
  ) %>%
  # Drop the grouping so later operations on rank_int are not silently grouped.
  ungroup()
write_tsv(rank_int, "reg_worst_best.tsv")
library(dplyr)
library(readr)
library(ggplot2)

# Read back the sorted data and the table of selected countries.
gapminder <- read_tsv("gap_sort.tsv")
rank_int <- read_tsv("reg_worst_best.tsv")

# Keep only the gapminder rows whose country also appears in rank_int.
pick <- semi_join(x = gapminder, y = rank_int, by = "country")

# Convert the two grouping columns to factors and show the resulting structure.
pick[["continent"]] <- as.factor(pick[["continent"]])
pick[["country"]] <- as.factor(pick[["country"]])
str(pick)
# For every continent in `pick`, plot lifeExp against year with a linear fit,
# one panel per country, and save it as lifeExp_growth_<continent>.png.
for (i in levels(pick$continent)) {
  continent_plot <- ggplot(pick %>% filter(continent == i), aes(x = year, y = lifeExp)) +
    geom_smooth(se = TRUE, method = "lm") +
    geom_point() +
    facet_wrap(~country) +
    ggtitle(paste("lifeExp changes in Best and Worst countries in", i, sep = " "))
  k <- paste0("lifeExp_growth_", i, ".png")
  # The plot is never printed inside the loop, so hand it to ggsave() explicitly
  # instead of depending on whatever last_plot() currently holds.
  ggsave(k, plot = continent_plot)
}